Retrieve the list of the world's most populated cities¶

import pandas as pd
import sys
# Add the parent directory to the module search path; presumably this is what
# makes the project-local `data` package importable from here (TODO confirm).
sys.path.append('..')
from data.unlabeled import WORLD_CITIES as wct

# Preview the first rows of the dataset.
wct.head()

Dataset normalization¶

# Number of missing values in each column.
wct.isna().sum()

city              0
city_ascii        0
lat               0
lng               0
country           0
iso2             31
iso3              0
admin_name       76
capital       18943
population      973
id                0
dtype: int64

# Row count after and before de-duplication; equal numbers mean no duplicates.
len(wct.drop_duplicates()), len(wct)

(26569, 26569)

# Discard the columns that are not used in the rest of the analysis.
wct = wct.drop(
    ["city", "iso2", "iso3", "admin_name", "capital", "id"],
    axis="columns",
)

wct.columns

Index(['city_ascii', 'lat', 'lng', 'country', 'population'], dtype='object')

# The ASCII spelling becomes the city name column.
wct = wct.rename({'city_ascii': 'city'}, axis="columns")

wct.columns

Index(['city', 'lat', 'lng', 'country', 'population'], dtype='object')

Missing values¶

# Cities that have no population value.
to_drop = wct.loc[wct["population"].isna()]
to_drop

# Cleaned dataset: every row that contains a missing value is removed.
# dropna() returns a new DataFrame, so `wct` keeps its incomplete rows and no
# shallow copy / in-place mutation of shared data is needed.
wctc = wct.dropna()

wctc.isnull().sum() # cleaned dataset

city          0
lat           0
lng           0
country       0
population    0
dtype: int64

Plot the cities with missing data on a map to check whether any country is left unrepresented¶

We need to verify whether the rows dropped for missing values are randomly distributed or follow a hidden pattern. This tells us whether all geographic areas are still represented after the cleaning step.

import geojson
import folium

---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
<ipython-input-12-6a09eb20f703> in <module>
----> 1 import geojson
      2 import folium

ModuleNotFoundError: No module named 'geojson'

# Names of the columns at positions 3 and 4 of the cleaned frame; they are
# exported as GeoJSON feature properties.
data_to_plot = wctc.columns[3:5].tolist()

def _json_safe(value):
    """Return *value* in a form the standard ``json`` encoder can serialize.

    Missing scalars (NaN, None, NaT, pd.NA) become ``None`` and NumPy scalars
    are unwrapped to the matching built-in Python type. Anything else is
    returned unchanged.
    """
    if not pd.api.types.is_scalar(value):
        return value
    if pd.isna(value):
        # NaN is not a legal JSON token; null is.
        return None
    unwrap = getattr(value, "item", None)
    return unwrap() if callable(unwrap) else value


def df_to_geojson(df, properties, lat='latitude', lon='longitude'):
    """
    Turn a dataframe containing point data into a geojson formatted python dictionary

    Each row becomes one ``Feature`` with a ``Point`` geometry. Missing values
    are written as ``None`` and NumPy scalars are converted to plain Python
    values so that the result can be serialized as strict JSON.

    df : the dataframe to convert to geojson
    properties : a list of columns in the dataframe to turn into geojson feature properties
    lat : the name of the column in the dataframe that contains latitude data
    lon : the name of the column in the dataframe that contains longitude data

    Returns a dict of the form ``{'type': 'FeatureCollection', 'features': [...]}``.
    Raises KeyError if a requested column is not present in a row.
    """
    # Collected in a plain list (not a local called `geojson`) so the name of
    # the `geojson` module imported by this notebook is not shadowed.
    features = []

    # loop through each row in the dataframe and convert each row to geojson format
    for _, row in df.iterrows():
        features.append({
            'type': 'Feature',
            # one property per requested column
            'properties': {prop: _json_safe(row[prop]) for prop in properties},
            'geometry': {
                'type': 'Point',
                # the coordinate pair is written longitude first, then latitude
                'coordinates': [_json_safe(row[lon]), _json_safe(row[lat])],
            },
        })

    return {'type': 'FeatureCollection', 'features': features}

# Convert the cities without population data into GeoJSON points.
geo = df_to_geojson(to_drop, data_to_plot, lat="lat", lon="lng")

# Low-zoom map centred at (9, 9), overlaid with those points.
m = folium.Map(location=[9, 9], zoom_start=2)
folium.GeoJson(geo).add_to(m)

# uncomment below to see the map
# m

<folium.features.GeoJson at 0x7f905d316a30>

to_drop.loc[to_drop["country"] == "Malta"]  # all null values

It looks like some countries have more missing information than others.

Retrieve the world's biggest cities¶

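A big city is defined as one with more than 500,000 residents in the reference year, 2020. Note that the code below does not apply this threshold directly: it keeps the 1,000 most populated cities of the cleaned dataset.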

# Order the cleaned dataset by descending population and keep the first 1000 rows.
big_cities = (
    wctc
    .sort_values("population", ascending=False)
    .iloc[:1000]
)

big_cities.head()

# Save the selection without the index column.
big_cities.to_csv('bigcities.csv', index=False)

# Plot the selected cities on a fresh map.
geo2 = df_to_geojson(big_cities, data_to_plot, lat="lat", lon="lng")
m2 = folium.Map(location=[9, 9], zoom_start=2)
folium.GeoJson(geo2).add_to(m2)

# uncomment below to see the map
# m2

<folium.features.GeoJson at 0x7f905e015a30>

We can see that some countries are not represented with this approach; for example, the African state of Namibia.

Retrieve the most populated cities (6 max) for every country¶

# Countries that still have at least one row in the cleaned dataset.
countries = list(wctc["country"].unique())

# For each country, its rows sorted by descending population, first six kept.
# NOTE(review): the rows are taken from `wct`, which still contains cities with
# a missing population, while the country list comes from `wctc`. sort_values
# places NaN last, so such cities can appear for countries with fewer than six
# populated entries. Confirm whether `wctc` was intended here.
per_country_top = [
    wct[wct["country"] == country]
    .sort_values(by=["population"], ascending=False)
    .head(6)
    for country in countries
]

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to stack the per-country frames. An empty frame with the expected columns is
# kept as the fallback because pd.concat raises on an empty list.
if per_country_top:
    top_cities = pd.concat(per_country_top, ignore_index=True)
else:
    top_cities = pd.DataFrame(columns=wctc.columns)

top_cities

# Save the per-country selection without the index column.
top_cities.to_csv('bigcities_allcountries.csv', index=False)

# Plot the per-country selection on a fresh map.
geo3 = df_to_geojson(top_cities, data_to_plot, lat="lat", lon="lng")
m3 = folium.Map(location=[9, 9], zoom_start=2)
folium.GeoJson(geo3).add_to(m3)

# display the map
m3

This way every country should be represented, probably including those in which many cities were dropped.

	city	city_ascii	lat	lng	country	iso2	iso3	admin_name	capital	population	id
0	Tokyo	Tokyo	35.6897	139.6922	Japan	JP	JPN	Tōkyō	primary	37977000.0	1392685764
1	Jakarta	Jakarta	-6.2146	106.8451	Indonesia	ID	IDN	Jakarta	primary	34540000.0	1360771077
2	Delhi	Delhi	28.6600	77.2300	India	IN	IND	Delhi	admin	29617000.0	1356872604
3	Mumbai	Mumbai	18.9667	72.8333	India	IN	IND	Mahārāshtra	admin	23355000.0	1356226629
4	Manila	Manila	14.5958	120.9772	Philippines	PH	PHL	Manila	primary	23088000.0	1608618140

	city	lat	lng	country	population
824	Al Quds	31.7764	35.2269	West Bank	NaN
827	Ngerulmud	7.5006	134.6242	Palau	NaN
6255	Un'goofaaru	5.6681	73.0302	Maldives	NaN
6393	Banqiao	25.0143	121.4672	Taiwan	NaN
7568	Naifaru	5.4442	73.3662	Maldives	NaN
...	...	...	...	...	...
9469	We	-20.9000	167.2667	New Caledonia	NaN
9470	Presevo	42.3067	21.6500	Serbia	NaN
9471	Bujanovac	42.4667	21.7667	Serbia	NaN
9472	Kitamilo	0.2222	33.2061	Uganda	NaN
9473	Tarrafal	15.2833	-23.7667	Cabo Verde	NaN

	city	lat	lng	country	population
7901	Sliema	35.9125	14.5019	Malta	NaN
8144	Fgura	35.8703	14.5133	Malta	NaN
8151	Hamrun	35.8847	14.4844	Malta	NaN
8238	Senglea	35.8875	14.5169	Malta	NaN
8264	Tarxien	35.8658	14.5150	Malta	NaN
...	...	...	...	...	...
8883	Santa Lucija	36.0431	14.2172	Malta	NaN
8935	Zebbug	36.0722	14.2358	Malta	NaN
8946	Imgarr	35.9206	14.3664	Malta	NaN
8971	Gharb	36.0600	14.2089	Malta	NaN
9035	San Lawrenz	36.0556	14.2036	Malta	NaN

	city	lat	lng	country	population
0	Tokyo	35.6897	139.6922	Japan	37977000.0
1	Jakarta	-6.2146	106.8451	Indonesia	34540000.0
2	Delhi	28.6600	77.2300	India	29617000.0
3	Mumbai	18.9667	72.8333	India	23355000.0
4	Manila	14.5958	120.9772	Philippines	23088000.0

	city	lat	lng	country	population
0	Tokyo	35.6897	139.6922	Japan	37977000.0
1	Osaka	34.6936	135.5019	Japan	14977000.0
2	Nagoya	35.1167	136.9333	Japan	9113000.0
3	Yokohama	35.4333	139.6333	Japan	3748781.0
4	Fukuoka	33.5903	130.4019	Japan	2128000.0
...	...	...	...	...	...
1131	Grand Turk	21.4664	-71.1360	Turks And Caicos Islands	5801.0
1132	Avarua	-21.2070	-159.7710	Cook Islands	5445.0
1133	Vatican City	41.9000	12.4478	Vatican City	825.0
1134	Stanley	-51.7000	-57.8500	Falkland Islands (Islas Malvinas)	2213.0
1135	Grytviken	-54.2806	-36.5080	South Georgia And South Sandwich Islands	99.0